# library(shiny)
library(dplyr)
library(ggplot2)
library(tidyr)
library(reshape2) # reshaping the data 
library(corrplot) # correlation matrix 
library(car) # applied regression 

# Load the four CSV inputs (paths are relative to the working directory) ----
data_1 <- read.csv("./salary_data_cleaned.csv")
data_2 <- read.csv("./glassdoor_jobs.csv")
# Data science job postings from Glassdoor.com for 2017-2018, 33 variables
data_3 <- read.csv("./eda_data.csv")
# Data scientist salaries (work_year values range from 2020 to 2024)
data_4 <- read.csv("./salaries_2.csv")

# Preview the first rows of every data frame
head(data_1, n = 10)
head(data_2, n = 5)
head(data_3, n = 5)
head(data_4, n = 5)
# checking for missing values in each dataframe 
sum(is.na(data_1))
[1] 0
sum(is.na(data_2))
[1] 0
sum(is.na(data_3))
[1] 0
sum(is.na(data_4))
[1] 0
# Replace empty strings in character columns with NA.
# dplyr verbs return a modified copy, so the result has to be assigned back;
# a bare `data_1 %>% mutate_if(...)` only prints the converted data frame and
# leaves data_1 unchanged, which makes the NA re-count below meaningless.
# Only character columns are touched, so numeric columns never reach `na_if()`.
empty_to_na <- function(df) {
  df %>%
    mutate(across(where(is.character), ~ na_if(.x, "")))
}

data_1 <- empty_to_na(data_1)
data_2 <- empty_to_na(data_2)
data_3 <- empty_to_na(data_3)
data_4 <- empty_to_na(data_4)
sum(is.na(data_1))
[1] 0
sum(is.na(data_2))
[1] 0
sum(is.na(data_3))
[1] 0
sum(is.na(data_4))
[1] 0
# Per-column summary of data_1 (the generic dispatches to the data.frame method)
summary(data_1)
  Job.Title         Salary.Estimate    Job.Description   
 Length:742         Length:742         Length:742        
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
     Rating       Company.Name         Location        
 Min.   :-1.000   Length:742         Length:742        
 1st Qu.: 3.300   Class :character   Class :character  
 Median : 3.700   Mode  :character   Mode  :character  
 Mean   : 3.619                                        
 3rd Qu.: 4.000                                        
 Max.   : 5.000                                        
 Headquarters           Size              Founded    
 Length:742         Length:742         Min.   :  -1  
 Class :character   Class :character   1st Qu.:1939  
 Mode  :character   Mode  :character   Median :1988  
                                       Mean   :1837  
                                       3rd Qu.:2007  
                                       Max.   :2019  
 Type.of.ownership    Industry            Sector         
 Length:742         Length:742         Length:742        
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
   Revenue          Competitors            hourly       
 Length:742         Length:742         Min.   :0.00000  
 Class :character   Class :character   1st Qu.:0.00000  
 Mode  :character   Mode  :character   Median :0.00000  
                                       Mean   :0.03234  
                                       3rd Qu.:0.00000  
                                       Max.   :1.00000  
 employer_provided   min_salary       max_salary   
 Min.   :0.00000   Min.   : 10.00   Min.   : 16.0  
 1st Qu.:0.00000   1st Qu.: 52.00   1st Qu.: 96.0  
 Median :0.00000   Median : 69.50   Median :124.0  
 Mean   :0.02291   Mean   : 74.07   Mean   :127.2  
 3rd Qu.:0.00000   3rd Qu.: 91.00   3rd Qu.:155.0  
 Max.   :1.00000   Max.   :202.00   Max.   :306.0  
   avg_salary    company_txt         job_state        
 Min.   : 13.5   Length:742         Length:742        
 1st Qu.: 73.5   Class :character   Class :character  
 Median : 97.5   Mode  :character   Mode  :character  
 Mean   :100.6                                        
 3rd Qu.:122.5                                        
 Max.   :254.0                                        
   same_state         age           python_yn     
 Min.   :0.000   Min.   : -1.00   Min.   :0.0000  
 1st Qu.:0.000   1st Qu.: 11.00   1st Qu.:0.0000  
 Median :1.000   Median : 24.00   Median :1.0000  
 Mean   :0.558   Mean   : 46.59   Mean   :0.5283  
 3rd Qu.:1.000   3rd Qu.: 59.00   3rd Qu.:1.0000  
 Max.   :1.000   Max.   :276.00   Max.   :1.0000  
      R_yn              spark             aws        
 Min.   :0.000000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :0.000000   Median :0.0000   Median :0.0000  
 Mean   :0.002695   Mean   :0.2251   Mean   :0.2372  
 3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:0.0000  
 Max.   :1.000000   Max.   :1.0000   Max.   :1.0000  
     excel       
 Min.   :0.0000  
 1st Qu.:0.0000  
 Median :1.0000  
 Mean   :0.5229  
 3rd Qu.:1.0000  
 Max.   :1.0000  
# Per-column summary of data_2 (the generic dispatches to the data.frame method)
summary(data_2)
       X          Job.Title         Salary.Estimate   
 Min.   :  0.0   Length:956         Length:956        
 1st Qu.:238.8   Class :character   Class :character  
 Median :477.5   Mode  :character   Mode  :character  
 Mean   :477.5                                        
 3rd Qu.:716.2                                        
 Max.   :955.0                                        
 Job.Description        Rating       Company.Name      
 Length:956         Min.   :-1.000   Length:956        
 Class :character   1st Qu.: 3.300   Class :character  
 Mode  :character   Median : 3.800   Mode  :character  
                    Mean   : 3.601                     
                    3rd Qu.: 4.200                     
                    Max.   : 5.000                     
   Location         Headquarters           Size          
 Length:956         Length:956         Length:956        
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
    Founded     Type.of.ownership    Industry        
 Min.   :  -1   Length:956         Length:956        
 1st Qu.:1937   Class :character   Class :character  
 Median :1992   Mode  :character   Mode  :character  
 Mean   :1775                                        
 3rd Qu.:2008                                        
 Max.   :2019                                        
    Sector            Revenue          Competitors       
 Length:956         Length:956         Length:956        
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
# Per-column summary of data_3 (the generic dispatches to the data.frame method)
summary(data_3)
       X          Job.Title         Salary.Estimate   
 Min.   :  0.0   Length:742         Length:742        
 1st Qu.:185.2   Class :character   Class :character  
 Median :370.5   Mode  :character   Mode  :character  
 Mean   :370.5                                        
 3rd Qu.:555.8                                        
 Max.   :741.0                                        
 Job.Description        Rating       Company.Name      
 Length:742         Min.   :-1.000   Length:742        
 Class :character   1st Qu.: 3.300   Class :character  
 Mode  :character   Median : 3.700   Mode  :character  
                    Mean   : 3.619                     
                    3rd Qu.: 4.000                     
                    Max.   : 5.000                     
   Location         Headquarters           Size          
 Length:742         Length:742         Length:742        
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
    Founded     Type.of.ownership    Industry        
 Min.   :  -1   Length:742         Length:742        
 1st Qu.:1939   Class :character   Class :character  
 Median :1988   Mode  :character   Mode  :character  
 Mean   :1837                                        
 3rd Qu.:2007                                        
 Max.   :2019                                        
    Sector            Revenue          Competitors       
 Length:742         Length:742         Length:742        
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
     hourly        employer_provided   min_salary    
 Min.   :0.00000   Min.   :0.00000   Min.   : 15.00  
 1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.: 52.00  
 Median :0.00000   Median :0.00000   Median : 69.50  
 Mean   :0.03234   Mean   :0.02291   Mean   : 74.72  
 3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.: 91.00  
 Max.   :1.00000   Max.   :1.00000   Max.   :202.00  
   max_salary      avg_salary    company_txt       
 Min.   : 16.0   Min.   : 13.5   Length:742        
 1st Qu.: 96.0   1st Qu.: 73.5   Class :character  
 Median :124.0   Median : 97.5   Mode  :character  
 Mean   :128.1   Mean   :100.6                     
 3rd Qu.:155.0   3rd Qu.:122.5                     
 Max.   :306.0   Max.   :254.0                     
  job_state           same_state         age        
 Length:742         Min.   :0.000   Min.   : -1.00  
 Class :character   1st Qu.:0.000   1st Qu.: 11.00  
 Mode  :character   Median :1.000   Median : 24.00  
                    Mean   :0.558   Mean   : 46.59  
                    3rd Qu.:1.000   3rd Qu.: 59.00  
                    Max.   :1.000   Max.   :276.00  
   python_yn           R_yn              spark       
 Min.   :0.0000   Min.   :0.000000   Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:0.000000   1st Qu.:0.0000  
 Median :1.0000   Median :0.000000   Median :0.0000  
 Mean   :0.5283   Mean   :0.002695   Mean   :0.2251  
 3rd Qu.:1.0000   3rd Qu.:0.000000   3rd Qu.:0.0000  
 Max.   :1.0000   Max.   :1.000000   Max.   :1.0000  
      aws             excel          job_simp        
 Min.   :0.0000   Min.   :0.0000   Length:742        
 1st Qu.:0.0000   1st Qu.:0.0000   Class :character  
 Median :0.0000   Median :1.0000   Mode  :character  
 Mean   :0.2372   Mean   :0.5229                     
 3rd Qu.:0.0000   3rd Qu.:1.0000                     
 Max.   :1.0000   Max.   :1.0000                     
  seniority            desc_len        num_comp    
 Length:742         Min.   :  407   Min.   :0.000  
 Class :character   1st Qu.: 2801   1st Qu.:0.000  
 Mode  :character   Median : 3731   Median :0.000  
                    Mean   : 3870   Mean   :1.054  
                    3rd Qu.: 4740   3rd Qu.:3.000  
                    Max.   :10051   Max.   :4.000  
# Per-column summary of data_4 (the generic dispatches to the data.frame method)
summary(data_4)
   work_year    experience_level   employment_type   
 Min.   :2020   Length:16494       Length:16494      
 1st Qu.:2023   Class :character   Class :character  
 Median :2023   Mode  :character   Mode  :character  
 Mean   :2023                                        
 3rd Qu.:2024                                        
 Max.   :2024                                        
  job_title             salary         salary_currency   
 Length:16494       Min.   :   14000   Length:16494      
 Class :character   1st Qu.:  102000   Class :character  
 Mode  :character   Median :  142200   Mode  :character  
                    Mean   :  163788                     
                    3rd Qu.:  187342                     
                    Max.   :30400000                     
 salary_in_usd    employee_residence  remote_ratio   
 Min.   : 15000   Length:16494       Min.   :  0.00  
 1st Qu.:101518   Class :character   1st Qu.:  0.00  
 Median :141300   Mode  :character   Median :  0.00  
 Mean   :149714                      Mean   : 32.04  
 3rd Qu.:185900                      3rd Qu.:100.00  
 Max.   :800000                      Max.   :100.00  
 company_location   company_size      
 Length:16494       Length:16494      
 Class :character   Class :character  
 Mode  :character   Mode  :character  
                                      
                                      
                                      
# Remove column X from data_3, then summarise the remaining columns
data_3 <- select(data_3, -X)
summary(data_3)
  Job.Title         Salary.Estimate    Job.Description   
 Length:742         Length:742         Length:742        
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
     Rating       Company.Name         Location        
 Min.   :-1.000   Length:742         Length:742        
 1st Qu.: 3.300   Class :character   Class :character  
 Median : 3.700   Mode  :character   Mode  :character  
 Mean   : 3.619                                        
 3rd Qu.: 4.000                                        
 Max.   : 5.000                                        
 Headquarters           Size              Founded    
 Length:742         Length:742         Min.   :  -1  
 Class :character   Class :character   1st Qu.:1939  
 Mode  :character   Mode  :character   Median :1988  
                                       Mean   :1837  
                                       3rd Qu.:2007  
                                       Max.   :2019  
 Type.of.ownership    Industry            Sector         
 Length:742         Length:742         Length:742        
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
   Revenue          Competitors            hourly       
 Length:742         Length:742         Min.   :0.00000  
 Class :character   Class :character   1st Qu.:0.00000  
 Mode  :character   Mode  :character   Median :0.00000  
                                       Mean   :0.03234  
                                       3rd Qu.:0.00000  
                                       Max.   :1.00000  
 employer_provided   min_salary       max_salary   
 Min.   :0.00000   Min.   : 15.00   Min.   : 16.0  
 1st Qu.:0.00000   1st Qu.: 52.00   1st Qu.: 96.0  
 Median :0.00000   Median : 69.50   Median :124.0  
 Mean   :0.02291   Mean   : 74.72   Mean   :128.1  
 3rd Qu.:0.00000   3rd Qu.: 91.00   3rd Qu.:155.0  
 Max.   :1.00000   Max.   :202.00   Max.   :306.0  
   avg_salary    company_txt         job_state        
 Min.   : 13.5   Length:742         Length:742        
 1st Qu.: 73.5   Class :character   Class :character  
 Median : 97.5   Mode  :character   Mode  :character  
 Mean   :100.6                                        
 3rd Qu.:122.5                                        
 Max.   :254.0                                        
   same_state         age           python_yn     
 Min.   :0.000   Min.   : -1.00   Min.   :0.0000  
 1st Qu.:0.000   1st Qu.: 11.00   1st Qu.:0.0000  
 Median :1.000   Median : 24.00   Median :1.0000  
 Mean   :0.558   Mean   : 46.59   Mean   :0.5283  
 3rd Qu.:1.000   3rd Qu.: 59.00   3rd Qu.:1.0000  
 Max.   :1.000   Max.   :276.00   Max.   :1.0000  
      R_yn              spark             aws        
 Min.   :0.000000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :0.000000   Median :0.0000   Median :0.0000  
 Mean   :0.002695   Mean   :0.2251   Mean   :0.2372  
 3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:0.0000  
 Max.   :1.000000   Max.   :1.0000   Max.   :1.0000  
     excel          job_simp          seniority        
 Min.   :0.0000   Length:742         Length:742        
 1st Qu.:0.0000   Class :character   Class :character  
 Median :1.0000   Mode  :character   Mode  :character  
 Mean   :0.5229                                        
 3rd Qu.:1.0000                                        
 Max.   :1.0000                                        
    desc_len        num_comp    
 Min.   :  407   Min.   :0.000  
 1st Qu.: 2801   1st Qu.:0.000  
 Median : 3731   Median :0.000  
 Mean   : 3870   Mean   :1.054  
 3rd Qu.: 4740   3rd Qu.:3.000  
 Max.   :10051   Max.   :4.000  
# how many different companies are in data_1
length(unique(data_1$company_txt))
[1] 343
# how many different industries are in data_1
length(unique(data_1$Industry))
[1] 60
# how many different sectors in data_1
length(unique(data_1$Sector))
[1] 25
# how many different job titles are in data_1
length(unique(data_1$Job.Title))
[1] 264
# how many different states in data_1
length(unique(data_1$job_state))
[1] 38
# Correlation heat map of the numeric variables in data_1 ----
# Keep the numeric columns only, leaving out Founded, hourly,
# employer_provided, same_state and age.
data_numeric <- data_1 %>%
  select(where(is.numeric)) %>%
  select(-any_of(
    c("Founded", "hourly", "employer_provided", "same_state", "age")
  ))

# Pairwise correlations (cor() default method), reshaped to one row per
# variable pair so that each pair becomes one tile.
cor_matrix <- cor(data_numeric)
cor_melted <- melt(
  cor_matrix,
  varnames = c("Var1", "Var2"),
  value.name = "Correlation"
)

ggplot(cor_melted, aes(x = Var1, y = Var2, fill = Correlation)) +
  geom_tile() +
  # Print the rounded coefficient on top of each tile
  geom_text(aes(label = round(Correlation, 2)), color = "black", size = 2) +
  scale_fill_gradient2(
    name = "Correlation",
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 0,
    # Spelled out in full: `limit` only worked through partial matching
    limits = c(-1, 1),
    space = "Lab"
  ) +
  labs(title = "Correlation Heatmap", x = "Variable", y = "Variable") +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)
  )

# Share of postings (in percent) that mention each tool ----
# Each *_jobs data frame holds the data_1 rows whose indicator column equals 1;
# each rate_* value is that subset's row count relative to all rows of data_1.
n_postings <- nrow(data_1)

python_jobs <- data_1[data_1$python_yn == 1, ]
r_jobs <- data_1[data_1$R_yn == 1, ]
spark_jobs <- data_1[data_1$spark == 1, ]
aws_jobs <- data_1[data_1$aws == 1, ]
excel_jobs <- data_1[data_1$excel == 1, ]

rate_python <- nrow(python_jobs) / n_postings * 100
rate_r <- nrow(r_jobs) / n_postings * 100
rate_spark <- nrow(spark_jobs) / n_postings * 100
rate_aws <- nrow(aws_jobs) / n_postings * 100
rate_excel <- nrow(excel_jobs) / n_postings * 100

skills_data <- data.frame(
  skill = c("Python", "R", "Spark", "AWS", "Excel"),
  rate = c(rate_python, rate_r, rate_spark, rate_aws, rate_excel)
)

# One bar per skill, labelled with its percentage at mid-height
ggplot(skills_data, aes(x = skill, y = rate, fill = skill)) +
  geom_col() +
  geom_text(
    aes(label = paste0(round(rate, 1), "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(
    title = "Proportion of Skills Required",
    x = "Skill",
    y = "Rate (%)"
  ) +
  theme_light()

# Multiple skills vs salary: count how many of the five tracked skills each
# posting asks for.
#
# Arguments:
#   data         data frame containing the indicator columns python_yn, R_yn,
#                spark, aws and excel.
#   skill_count  unused; kept (now optional) so existing calls keep working.
#
# Returns `data` as a tibble with an added integer column `skill_count`: the
# number of indicator columns equal to 1 in that row (NA when any indicator in
# the row is NA). No rows are removed here; callers filter on `skill_count`.
filter_skills <- function(data, skill_count = NULL) {
  skill_cols <- c("python_yn", "R_yn", "spark", "aws", "excel")

  missing_cols <- setdiff(skill_cols, names(data))
  if (length(missing_cols) > 0) {
    stop(
      "`data` is missing skill column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  data <- dplyr::as_tibble(data)
  # Vectorised count over the indicator columns instead of a row-wise loop
  data$skill_count <- as.integer(rowSums(data[skill_cols] == 1))
  data
}


  # Box plot of average salary by number of required skills ----
  # The bounds cover every count five indicator columns can produce (0 to 5),
  # so this filter only drops rows whose skill_count is NA.
  data_skills <- filter_skills(data_1) %>%
    filter(skill_count >= 0 & skill_count <= 5)

  # The title used to be built with paste(i, ...), but no `i` is defined at
  # this point, which stops the plot with "object 'i' not found".
  ggplot(data_skills, aes(x = factor(skill_count), y = avg_salary)) +
    geom_boxplot() +
    theme_minimal() +
    labs(
      title = "Number of skills vs Avg Salary",
      x = "Number of Skills",
      y = "Average Salary(thousands)"
    ) +
    theme(plot.title = element_text(hjust = 0.5))

NA
NA

# Linear model of avg_salary on rating, role, state, skills, industry and
# revenue, fitted on data_3 ----
# Keep only the modelling columns and turn every predictor except Rating into
# a factor, so lm() treats them as categorical.
data_processed <- data_3 %>%
  select(
    Rating, avg_salary, job_simp, job_state,
    python_yn, R_yn, spark, aws, excel,
    Industry, Revenue
  ) %>%
  mutate(across(-c(Rating, avg_salary), as.factor))

model <- lm(
  avg_salary ~ Rating + job_simp + job_state + python_yn + R_yn + spark +
    aws + excel + Industry + Revenue,
  data = data_processed
)

summary(model)

Call:
lm(formula = avg_salary ~ Rating + job_simp + job_state + python_yn + 
    R_yn + spark + aws + excel + Industry + Revenue, data = data_processed)

Residuals:
    Min      1Q  Median      3Q     Max 
-107.62  -14.95    0.00   13.17  133.26 

Coefficients: (1 not defined because of singularities)
                                                 Estimate
(Intercept)                                       98.0694
Rating                                             2.5376
job_simpdata engineer                             33.8054
job_simpdata scientist                            40.9660
job_simpdirector                                  88.5658
job_simpmanager                                    7.0144
job_simpmle                                       55.9473
job_simpna                                        16.8178
job_stateAZ                                       -2.0590
job_stateCA                                       30.1917
job_stateCO                                      -12.9093
job_stateCT                                      -34.6073
job_stateDC                                       16.4993
job_stateDE                                      -27.2004
job_stateFL                                      -12.6928
job_stateGA                                      -16.7533
job_stateIA                                        8.1254
job_stateID                                      -25.2666
job_stateIL                                       13.0808
job_stateIN                                       -2.1185
job_stateKS                                      -75.8209
job_stateKY                                       11.7685
job_stateLA                                      -28.1624
job_stateMA                                        4.3689
job_stateMD                                        6.8498
job_stateMI                                       11.6912
job_stateMN                                        8.1189
job_stateMO                                       12.3346
job_stateNC                                        6.0081
job_stateNE                                      -23.1371
job_stateNJ                                        7.5742
job_stateNM                                      -42.9483
job_stateNY                                        8.3923
job_stateOH                                        3.3154
job_stateOR                                        0.9058
job_statePA                                        8.3758
job_stateRI                                       39.5152
job_stateSC                                       37.8292
job_stateTN                                       -3.7972
job_stateTX                                       -6.4142
job_stateUT                                        6.3697
job_stateVA                                        2.4295
job_stateWA                                       12.7499
job_stateWI                                        1.5595
python_yn1                                         6.2350
R_yn1                                              8.0145
spark1                                            -3.8294
aws1                                               5.4445
excel1                                             0.7815
IndustryAccounting                               -27.3787
IndustryAdvertising & Marketing                  -41.2832
IndustryAerospace & Defense                      -43.9131
IndustryArchitectural & Engineering Services     -81.8475
IndustryAuctions & Galleries                     -27.1965
IndustryBanks & Credit Unions                    -60.5123
IndustryBeauty & Personal Accessories Stores     -39.1759
IndustryBiotech & Pharmaceuticals                -28.3646
IndustryBrokerage Services                         2.7746
IndustryColleges & Universities                  -39.0168
IndustryComputer Hardware & Software             -34.8746
IndustryConstruction                             -91.9622
IndustryConsulting                               -37.4573
IndustryConsumer Product Rental                  -63.1633
IndustryConsumer Products Manufacturing          -38.2034
IndustryDepartment, Clothing, & Shoe Stores      -31.0168
IndustryEducation Training Services              -73.6498
IndustryEnergy                                   -70.7289
IndustryEnterprise Software & Network Solutions  -28.6853
IndustryFarm Support Services                    -58.5843
IndustryFederal Agencies                         -55.4564
IndustryFinancial Analytics & Research           -22.3137
IndustryFinancial Transaction Processing         -54.3022
IndustryFood & Beverage Manufacturing            -63.9363
IndustryGambling                                 -75.1210
IndustryGas Stations                             -46.8705
IndustryHealth Care Products Manufacturing       -39.1578
IndustryHealth Care Services & Hospitals         -54.7651
IndustryHealth, Beauty, & Fitness                -72.5956
IndustryIndustrial Manufacturing                 -73.8454
IndustryInsurance Agencies & Brokerages          -43.0114
IndustryInsurance Carriers                       -41.7075
IndustryInternet                                 -28.0805
IndustryInvestment Banking & Asset Management    -15.8407
IndustryIT Services                              -36.7093
IndustryK-12 Education                           -58.2762
IndustryLending                                  -44.5079
IndustryLogistics & Supply Chain                 -60.2426
IndustryMetals Brokers                           -48.2462
IndustryMining                                   -61.7283
IndustryMotion Picture Production & Distribution   6.5835
IndustryOther Retail Stores                      -19.0592
IndustryReal Estate                              -46.9169
IndustryReligious Organizations                  -46.5904
IndustryResearch & Development                   -58.3445
IndustrySecurity Services                        -26.1471
IndustrySocial Assistance                        -59.5021
IndustrySporting Goods Stores                    -77.7117
IndustryStaffing & Outsourcing                   -64.1122
IndustryStock Exchanges                                NA
IndustryTelecommunications Manufacturing         -40.4989
IndustryTelecommunications Services              -43.6140
IndustryTransportation Equipment Manufacturing   -62.6550
IndustryTransportation Management                -36.4182
IndustryTravel Agencies                          -52.9995
IndustryTrucking                                 -23.3470
IndustryTV Broadcast & Cable Networks            -32.7397
IndustryVideo Games                              -66.2078
IndustryWholesale                                -59.0957
Revenue$1 to $2 billion (USD)                     -1.7843
Revenue$1 to $5 million (USD)                     -8.9855
Revenue$10 to $25 million (USD)                  -16.0173
Revenue$10+ billion (USD)                          2.9453
Revenue$100 to $500 million (USD)                -14.5568
Revenue$2 to $5 billion (USD)                     -9.6651
Revenue$25 to $50 million (USD)                  -21.5341
Revenue$5 to $10 billion (USD)                    -0.5771
Revenue$5 to $10 million (USD)                    27.7521
Revenue$50 to $100 million (USD)                  -9.5054
Revenue$500 million to $1 billion (USD)           -7.6013
RevenueLess than $1 million (USD)                -40.9042
RevenueUnknown / Non-Applicable                   -9.9493
                                                 Std. Error
(Intercept)                                         29.7424
Rating                                               2.0691
job_simpdata engineer                                4.8421
job_simpdata scientist                               4.1544
job_simpdirector                                     9.0189
job_simpmanager                                      7.4187
job_simpmle                                          8.0563
job_simpna                                           4.5546
job_stateAZ                                         14.8357
job_stateCA                                         11.7112
job_stateCO                                         14.1865
job_stateCT                                         17.1992
job_stateDC                                         14.2495
job_stateDE                                         17.0202
job_stateFL                                         13.9595
job_stateGA                                         16.0018
job_stateIA                                         17.7771
job_stateID                                         27.6969
job_stateIL                                         12.7844
job_stateIN                                         14.6888
job_stateKS                                         25.5462
job_stateKY                                         18.2380
job_stateLA                                         18.1018
job_stateMA                                         12.0118
job_stateMD                                         12.0598
job_stateMI                                         17.9264
job_stateMN                                         22.7009
job_stateMO                                         15.3389
job_stateNC                                         13.6688
job_stateNE                                         18.9118
job_stateNJ                                         13.7564
job_stateNM                                         19.5681
job_stateNY                                         12.0465
job_stateOH                                         14.3513
job_stateOR                                         18.6413
job_statePA                                         13.3573
job_stateRI                                         29.8500
job_stateSC                                         32.1668
job_stateTN                                         15.0934
job_stateTX                                         13.0192
job_stateUT                                         20.5097
job_stateVA                                         11.9218
job_stateWA                                         14.5571
job_stateWI                                         15.8276
python_yn1                                           2.7870
R_yn1                                               25.9298
spark1                                               3.2545
aws1                                                 2.9803
excel1                                               2.4449
IndustryAccounting                                  32.9108
IndustryAdvertising & Marketing                     14.6378
IndustryAerospace & Defense                         15.6051
IndustryArchitectural & Engineering Services        22.1421
IndustryAuctions & Galleries                        30.7326
IndustryBanks & Credit Unions                       16.1836
IndustryBeauty & Personal Accessories Stores        29.7001
IndustryBiotech & Pharmaceuticals                   13.3360
IndustryBrokerage Services                          24.1069
IndustryColleges & Universities                     15.1475
IndustryComputer Hardware & Software                15.0430
IndustryConstruction                                19.7900
IndustryConsulting                                  15.3841
IndustryConsumer Product Rental                     21.2158
IndustryConsumer Products Manufacturing             15.4945
IndustryDepartment, Clothing, & Shoe Stores         17.5149
IndustryEducation Training Services                 21.4387
IndustryEnergy                                      18.2461
IndustryEnterprise Software & Network Solutions     14.6995
IndustryFarm Support Services                       32.8583
IndustryFederal Agencies                            16.0560
IndustryFinancial Analytics & Research              17.9261
IndustryFinancial Transaction Processing            20.2709
IndustryFood & Beverage Manufacturing               16.8874
IndustryGambling                                    20.2564
IndustryGas Stations                                21.9788
IndustryHealth Care Products Manufacturing          31.3432
IndustryHealth Care Services & Hospitals            14.1671
IndustryHealth, Beauty, & Fitness                   32.3617
IndustryIndustrial Manufacturing                    19.8789
IndustryInsurance Agencies & Brokerages             17.9046
IndustryInsurance Carriers                          13.8519
IndustryInternet                                    15.5202
IndustryInvestment Banking & Asset Management       19.4566
IndustryIT Services                                 14.2034
IndustryK-12 Education                              20.0433
IndustryLending                                     17.8319
IndustryLogistics & Supply Chain                    21.1218
IndustryMetals Brokers                              25.6368
IndustryMining                                      31.2623
IndustryMotion Picture Production & Distribution    31.0956
IndustryOther Retail Stores                         30.9008
IndustryReal Estate                                 17.8614
IndustryReligious Organizations                     23.9757
IndustryResearch & Development                      15.2970
IndustrySecurity Services                           19.3169
IndustrySocial Assistance                           21.7173
IndustrySporting Goods Stores                       25.8427
IndustryStaffing & Outsourcing                      19.2225
IndustryStock Exchanges                                  NA
IndustryTelecommunications Manufacturing            25.5230
IndustryTelecommunications Services                 20.9031
IndustryTransportation Equipment Manufacturing      34.0424
IndustryTransportation Management                   22.0203
IndustryTravel Agencies                             18.0637
IndustryTrucking                                    31.8976
IndustryTV Broadcast & Cable Networks               24.2450
IndustryVideo Games                                 23.1375
IndustryWholesale                                   21.4668
Revenue$1 to $2 billion (USD)                       28.8346
Revenue$1 to $5 million (USD)                       31.0939
Revenue$10 to $25 million (USD)                     29.0204
Revenue$10+ billion (USD)                           28.7066
Revenue$100 to $500 million (USD)                   28.7133
Revenue$2 to $5 billion (USD)                       29.1895
Revenue$25 to $50 million (USD)                     28.9459
Revenue$5 to $10 billion (USD)                      29.5280
Revenue$5 to $10 million (USD)                      29.6584
Revenue$50 to $100 million (USD)                    28.9581
Revenue$500 million to $1 billion (USD)             29.0157
RevenueLess than $1 million (USD)                   32.1823
RevenueUnknown / Non-Applicable                     28.4257
                                                 t value
(Intercept)                                        3.297
Rating                                             1.226
job_simpdata engineer                              6.982
job_simpdata scientist                             9.861
job_simpdirector                                   9.820
job_simpmanager                                    0.946
job_simpmle                                        6.945
job_simpna                                         3.692
job_stateAZ                                       -0.139
job_stateCA                                        2.578
job_stateCO                                       -0.910
job_stateCT                                       -2.012
job_stateDC                                        1.158
job_stateDE                                       -1.598
job_stateFL                                       -0.909
job_stateGA                                       -1.047
job_stateIA                                        0.457
job_stateID                                       -0.912
job_stateIL                                        1.023
job_stateIN                                       -0.144
job_stateKS                                       -2.968
job_stateKY                                        0.645
job_stateLA                                       -1.556
job_stateMA                                        0.364
job_stateMD                                        0.568
job_stateMI                                        0.652
job_stateMN                                        0.358
job_stateMO                                        0.804
job_stateNC                                        0.440
job_stateNE                                       -1.223
job_stateNJ                                        0.551
job_stateNM                                       -2.195
job_stateNY                                        0.697
job_stateOH                                        0.231
job_stateOR                                        0.049
job_statePA                                        0.627
job_stateRI                                        1.324
job_stateSC                                        1.176
job_stateTN                                       -0.252
job_stateTX                                       -0.493
job_stateUT                                        0.311
job_stateVA                                        0.204
job_stateWA                                        0.876
job_stateWI                                        0.099
python_yn1                                         2.237
R_yn1                                              0.309
spark1                                            -1.177
aws1                                               1.827
excel1                                             0.320
IndustryAccounting                                -0.832
IndustryAdvertising & Marketing                   -2.820
IndustryAerospace & Defense                       -2.814
IndustryArchitectural & Engineering Services      -3.696
IndustryAuctions & Galleries                      -0.885
IndustryBanks & Credit Unions                     -3.739
IndustryBeauty & Personal Accessories Stores      -1.319
IndustryBiotech & Pharmaceuticals                 -2.127
IndustryBrokerage Services                         0.115
IndustryColleges & Universities                   -2.576
IndustryComputer Hardware & Software              -2.318
IndustryConstruction                              -4.647
IndustryConsulting                                -2.435
IndustryConsumer Product Rental                   -2.977
IndustryConsumer Products Manufacturing           -2.466
IndustryDepartment, Clothing, & Shoe Stores       -1.771
IndustryEducation Training Services               -3.435
IndustryEnergy                                    -3.876
IndustryEnterprise Software & Network Solutions   -1.951
IndustryFarm Support Services                     -1.783
IndustryFederal Agencies                          -3.454
IndustryFinancial Analytics & Research            -1.245
IndustryFinancial Transaction Processing          -2.679
IndustryFood & Beverage Manufacturing             -3.786
IndustryGambling                                  -3.709
IndustryGas Stations                              -2.133
IndustryHealth Care Products Manufacturing        -1.249
IndustryHealth Care Services & Hospitals          -3.866
IndustryHealth, Beauty, & Fitness                 -2.243
IndustryIndustrial Manufacturing                  -3.715
IndustryInsurance Agencies & Brokerages           -2.402
IndustryInsurance Carriers                        -3.011
IndustryInternet                                  -1.809
IndustryInvestment Banking & Asset Management     -0.814
IndustryIT Services                               -2.585
IndustryK-12 Education                            -2.908
IndustryLending                                   -2.496
IndustryLogistics & Supply Chain                  -2.852
IndustryMetals Brokers                            -1.882
IndustryMining                                    -1.975
IndustryMotion Picture Production & Distribution   0.212
IndustryOther Retail Stores                       -0.617
IndustryReal Estate                               -2.627
IndustryReligious Organizations                   -1.943
IndustryResearch & Development                    -3.814
IndustrySecurity Services                         -1.354
IndustrySocial Assistance                         -2.740
IndustrySporting Goods Stores                     -3.007
IndustryStaffing & Outsourcing                    -3.335
IndustryStock Exchanges                               NA
IndustryTelecommunications Manufacturing          -1.587
IndustryTelecommunications Services               -2.086
IndustryTransportation Equipment Manufacturing    -1.840
IndustryTransportation Management                 -1.654
IndustryTravel Agencies                           -2.934
IndustryTrucking                                  -0.732
IndustryTV Broadcast & Cable Networks             -1.350
IndustryVideo Games                               -2.861
IndustryWholesale                                 -2.753
Revenue$1 to $2 billion (USD)                     -0.062
Revenue$1 to $5 million (USD)                     -0.289
Revenue$10 to $25 million (USD)                   -0.552
Revenue$10+ billion (USD)                          0.103
Revenue$100 to $500 million (USD)                 -0.507
Revenue$2 to $5 billion (USD)                     -0.331
Revenue$25 to $50 million (USD)                   -0.744
Revenue$5 to $10 billion (USD)                    -0.020
Revenue$5 to $10 million (USD)                     0.936
Revenue$50 to $100 million (USD)                  -0.328
Revenue$500 million to $1 billion (USD)           -0.262
RevenueLess than $1 million (USD)                 -1.271
RevenueUnknown / Non-Applicable                   -0.350
                                                 Pr(>|t|)
(Intercept)                                      0.001032
Rating                                           0.220516
job_simpdata engineer                            7.51e-12
job_simpdata scientist                            < 2e-16
job_simpdirector                                  < 2e-16
job_simpmanager                                  0.344770
job_simpmle                                      9.59e-12
job_simpna                                       0.000242
job_stateAZ                                      0.889664
job_stateCA                                      0.010166
job_stateCO                                      0.363191
job_stateCT                                      0.044635
job_stateDC                                      0.247356
job_stateDE                                      0.110523
job_stateFL                                      0.363566
job_stateGA                                      0.295524
job_stateIA                                      0.647779
job_stateID                                      0.361990
job_stateIL                                      0.306617
job_stateIN                                      0.885371
job_stateKS                                      0.003113
job_stateKY                                      0.518987
job_stateLA                                      0.120269
job_stateMA                                      0.716192
job_stateMD                                      0.570246
job_stateMI                                      0.514528
job_stateMN                                      0.720728
job_stateMO                                      0.421624
job_stateNC                                      0.660418
job_stateNE                                      0.221636
job_stateNJ                                      0.582109
job_stateNM                                      0.028546
job_stateNY                                      0.486279
job_stateOH                                      0.817376
job_stateOR                                      0.961262
job_statePA                                      0.530851
job_stateRI                                      0.186058
job_stateSC                                      0.240032
job_stateTN                                      0.801447
job_stateTX                                      0.622418
job_stateUT                                      0.756233
job_stateVA                                      0.838590
job_stateWA                                      0.381448
job_stateWI                                      0.921542
python_yn1                                       0.025629
R_yn1                                            0.757360
spark1                                           0.239795
aws1                                             0.068208
excel1                                           0.749334
IndustryAccounting                               0.405781
IndustryAdvertising & Marketing                  0.004951
IndustryAerospace & Defense                      0.005047
IndustryArchitectural & Engineering Services     0.000238
IndustryAuctions & Galleries                     0.376530
IndustryBanks & Credit Unions                    0.000202
IndustryBeauty & Personal Accessories Stores     0.187638
IndustryBiotech & Pharmaceuticals                0.033819
IndustryBrokerage Services                       0.908406
IndustryColleges & Universities                  0.010231
IndustryComputer Hardware & Software             0.020754
IndustryConstruction                             4.12e-06
IndustryConsulting                               0.015180
IndustryConsumer Product Rental                  0.003022
IndustryConsumer Products Manufacturing          0.013947
IndustryDepartment, Clothing, & Shoe Stores      0.077070
IndustryEducation Training Services              0.000631
IndustryEnergy                                   0.000117
IndustryEnterprise Software & Network Solutions  0.051453
IndustryFarm Support Services                    0.075084
IndustryFederal Agencies                         0.000590
IndustryFinancial Analytics & Research           0.213688
IndustryFinancial Transaction Processing         0.007583
IndustryFood & Beverage Manufacturing            0.000168
IndustryGambling                                 0.000227
IndustryGas Stations                             0.033354
IndustryHealth Care Products Manufacturing       0.212017
IndustryHealth Care Services & Hospitals         0.000122
IndustryHealth, Beauty, & Fitness                0.025232
IndustryIndustrial Manufacturing                 0.000222
IndustryInsurance Agencies & Brokerages          0.016586
IndustryInsurance Carriers                       0.002710
IndustryInternet                                 0.070889
IndustryInvestment Banking & Asset Management    0.415867
IndustryIT Services                              0.009978
IndustryK-12 Education                           0.003773
IndustryLending                                  0.012819
IndustryLogistics & Supply Chain                 0.004487
IndustryMetals Brokers                           0.060314
IndustryMining                                   0.048764
IndustryMotion Picture Production & Distribution 0.832395
IndustryOther Retail Stores                      0.537602
IndustryReal Estate                              0.008834
IndustryReligious Organizations                  0.052439
IndustryResearch & Development                   0.000150
IndustrySecurity Services                        0.176360
IndustrySocial Assistance                        0.006323
IndustrySporting Goods Stores                    0.002744
IndustryStaffing & Outsourcing                   0.000903
IndustryStock Exchanges                                NA
IndustryTelecommunications Manufacturing         0.113075
IndustryTelecommunications Services              0.037342
IndustryTransportation Equipment Manufacturing   0.066172
IndustryTransportation Management                0.098663
IndustryTravel Agencies                          0.003469
IndustryTrucking                                 0.464484
IndustryTV Broadcast & Cable Networks            0.177390
IndustryVideo Games                              0.004358
IndustryWholesale                                0.006080
Revenue$1 to $2 billion (USD)                    0.950678
Revenue$1 to $5 million (USD)                    0.772694
Revenue$10 to $25 million (USD)                  0.581193
Revenue$10+ billion (USD)                        0.918313
Revenue$100 to $500 million (USD)                0.612355
Revenue$2 to $5 billion (USD)                    0.740668
Revenue$25 to $50 million (USD)                  0.457192
Revenue$5 to $10 billion (USD)                   0.984412
Revenue$5 to $10 million (USD)                   0.349778
Revenue$50 to $100 million (USD)                 0.742836
Revenue$500 million to $1 billion (USD)          0.793431
RevenueLess than $1 million (USD)                0.204199
RevenueUnknown / Non-Applicable                  0.726449
                                                    
(Intercept)                                      ** 
Rating                                              
job_simpdata engineer                            ***
job_simpdata scientist                           ***
job_simpdirector                                 ***
job_simpmanager                                     
job_simpmle                                      ***
job_simpna                                       ***
job_stateAZ                                         
job_stateCA                                      *  
job_stateCO                                         
job_stateCT                                      *  
job_stateDC                                         
job_stateDE                                         
job_stateFL                                         
job_stateGA                                         
job_stateIA                                         
job_stateID                                         
job_stateIL                                         
job_stateIN                                         
job_stateKS                                      ** 
job_stateKY                                         
job_stateLA                                         
job_stateMA                                         
job_stateMD                                         
job_stateMI                                         
job_stateMN                                         
job_stateMO                                         
job_stateNC                                         
job_stateNE                                         
job_stateNJ                                         
job_stateNM                                      *  
job_stateNY                                         
job_stateOH                                         
job_stateOR                                         
job_statePA                                         
job_stateRI                                         
job_stateSC                                         
job_stateTN                                         
job_stateTX                                         
job_stateUT                                         
job_stateVA                                         
job_stateWA                                         
job_stateWI                                         
python_yn1                                       *  
R_yn1                                               
spark1                                              
aws1                                             .  
excel1                                              
IndustryAccounting                                  
IndustryAdvertising & Marketing                  ** 
IndustryAerospace & Defense                      ** 
IndustryArchitectural & Engineering Services     ***
IndustryAuctions & Galleries                        
IndustryBanks & Credit Unions                    ***
IndustryBeauty & Personal Accessories Stores        
IndustryBiotech & Pharmaceuticals                *  
IndustryBrokerage Services                          
IndustryColleges & Universities                  *  
IndustryComputer Hardware & Software             *  
IndustryConstruction                             ***
IndustryConsulting                               *  
IndustryConsumer Product Rental                  ** 
IndustryConsumer Products Manufacturing          *  
IndustryDepartment, Clothing, & Shoe Stores      .  
IndustryEducation Training Services              ***
IndustryEnergy                                   ***
IndustryEnterprise Software & Network Solutions  .  
IndustryFarm Support Services                    .  
IndustryFederal Agencies                         ***
IndustryFinancial Analytics & Research              
IndustryFinancial Transaction Processing         ** 
IndustryFood & Beverage Manufacturing            ***
IndustryGambling                                 ***
IndustryGas Stations                             *  
IndustryHealth Care Products Manufacturing          
IndustryHealth Care Services & Hospitals         ***
IndustryHealth, Beauty, & Fitness                *  
IndustryIndustrial Manufacturing                 ***
IndustryInsurance Agencies & Brokerages          *  
IndustryInsurance Carriers                       ** 
IndustryInternet                                 .  
IndustryInvestment Banking & Asset Management       
IndustryIT Services                              ** 
IndustryK-12 Education                           ** 
IndustryLending                                  *  
IndustryLogistics & Supply Chain                 ** 
IndustryMetals Brokers                           .  
IndustryMining                                   *  
IndustryMotion Picture Production & Distribution    
IndustryOther Retail Stores                         
IndustryReal Estate                              ** 
IndustryReligious Organizations                  .  
IndustryResearch & Development                   ***
IndustrySecurity Services                           
IndustrySocial Assistance                        ** 
IndustrySporting Goods Stores                    ** 
IndustryStaffing & Outsourcing                   ***
IndustryStock Exchanges                             
IndustryTelecommunications Manufacturing            
IndustryTelecommunications Services              *  
IndustryTransportation Equipment Manufacturing   .  
IndustryTransportation Management                .  
IndustryTravel Agencies                          ** 
IndustryTrucking                                    
IndustryTV Broadcast & Cable Networks               
IndustryVideo Games                              ** 
IndustryWholesale                                ** 
Revenue$1 to $2 billion (USD)                       
Revenue$1 to $5 million (USD)                       
Revenue$10 to $25 million (USD)                     
Revenue$10+ billion (USD)                           
Revenue$100 to $500 million (USD)                   
Revenue$2 to $5 billion (USD)                       
Revenue$25 to $50 million (USD)                     
Revenue$5 to $10 billion (USD)                      
Revenue$5 to $10 million (USD)                      
Revenue$50 to $100 million (USD)                    
Revenue$500 million to $1 billion (USD)             
RevenueLess than $1 million (USD)                   
RevenueUnknown / Non-Applicable                     
---
Signif. codes:  
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 26.86 on 622 degrees of freedom
Multiple R-squared:  0.599, Adjusted R-squared:  0.5223 
F-statistic: 7.807 on 119 and 622 DF,  p-value: < 2.2e-16
# Diagnostic plot 1: raw residuals of `model` against its fitted values,
# with a smoother (no confidence band) to reveal any systematic trend.
ggplot(data = model, mapping = aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  ggtitle("Residuals vs Fitted") +
  xlab("Fitted values") +
  ylab("Residuals") +
  theme_minimal()
`geom_smooth()` using method = 'loess' and formula = 'y ~
x'

# Diagnostic plot 2: normal Q-Q plot of the standardized residuals of `model`,
# with the reference line drawn through the sample quantiles.
ggplot(data = model, mapping = aes(sample = .stdresid)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("Normal Q-Q") +
  xlab("Theoretical Quantiles") +
  ylab("Standardized Residuals") +
  theme_minimal()
Warning: Removed 13 rows containing non-finite outside the scale
range (`stat_qq()`).
Warning: Removed 13 rows containing non-finite outside the scale
range (`stat_qq_line()`).

# Diagnostic plot 3: Scale-Location (Spread-Location). Plots the square root
# of the absolute standardized residuals against the fitted values so that a
# change in spread across the fitted range becomes visible.
ggplot(
  data = model,
  mapping = aes(x = .fitted, y = sqrt(abs(.stdresid)))
) +
  geom_point() +
  geom_smooth(se = FALSE) +
  ggtitle("Scale-Location") +
  xlab("Fitted values") +
  ylab("Square Root of Standardized Residuals") +
  theme_minimal()
`geom_smooth()` using method = 'loess' and formula = 'y ~
x'
Warning: Removed 13 rows containing non-finite outside the scale
range (`stat_smooth()`).
Warning: Removed 13 rows containing missing values or values outside
the scale range (`geom_point()`).

# Diagnostic plot 4: standardized residuals against leverage (hat values),
# used to spot observations that are both unusual and influential.
ggplot(data = model, mapping = aes(x = .hat, y = .stdresid)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  ggtitle("Residuals vs Leverage") +
  xlab("Leverage") +
  ylab("Standardized Residuals") +
  theme_minimal()
`geom_smooth()` using method = 'loess' and formula = 'y ~
x'
Warning: Removed 13 rows containing non-finite outside the scale
range (`stat_smooth()`).
Warning: Removed 13 rows containing missing values or values outside
the scale range (`geom_point()`).

# Scatter of company rating against average salary with a straight-line
# (lm) fit overlaid in blue, no confidence band.
data_processed %>%
  ggplot(mapping = aes(x = Rating, y = avg_salary)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, colour = "blue") +
  ggtitle("Effect of Rating on Avg Salary") +
  xlab("Rating") +
  ylab("Average Salary") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

# Box plot of average salary for each simplified job title (job_simp).
# The x tick labels are rotated 45 degrees so the titles do not overlap.
data_processed %>%
  ggplot(mapping = aes(x = job_simp, y = avg_salary)) +
  geom_boxplot() +
  ggtitle("Effect of Job Title on Salary") +
  xlab("Job Title") +
  ylab("Average Salary") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


# Box plot of average salary for each job_state value.
# The x tick labels are rotated 60 degrees to keep the labels legible.
data_processed %>%
  ggplot(mapping = aes(x = job_state, y = avg_salary)) +
  geom_boxplot() +
  ggtitle("Effect of Job State on Avg Salary") +
  xlab("Job State") +
  ylab("Average Salary") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

NA
NA
NA
# Scatter of skill_count against average salary with a straight-line (lm)
# fit in blue. Uses the `filtered_data` built earlier in the script
# (NOTE: that name is reassigned further down for the ownership summary).
filtered_data %>%
  ggplot(mapping = aes(x = skill_count, y = avg_salary)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, colour = "blue") +
  ggtitle("Number of Skills vs Avg Salary") +
  xlab("Number of Skills") +
  ylab("Average Salary") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

# Count of postings per seniority label in data_3.
table(data_3[["seniority"]])

    jr     na senior 
     2    520    220 
# Group by industry and seniority to find the average salary for each
# seniority level within every industry.
# `.groups = "drop"` returns an ungrouped tibble directly, which replaces the
# trailing ungroup() and makes the regrouping behaviour explicit instead of
# relying on summarise()'s default (which keeps the 'Industry' grouping and
# prints an informational message).
industry_salary <- data_3 %>%
  group_by(Industry, seniority) %>%
  summarize(avg_salary = mean(avg_salary, na.rm = TRUE), .groups = "drop")
`summarise()` has grouped output by 'Industry'. You can
override using the `.groups` argument.
# Ten industry/seniority rows with the highest average salary among
# postings labelled "senior".
top_senior <- industry_salary %>%
  filter(seniority == "senior") %>%
  arrange(desc(avg_salary)) %>%
  slice_head(n = 10)

# Same ranking for the remaining labels: "jr" and "na" (seniority not
# specified in the posting).
top_non_senior <- industry_salary %>%
  filter(seniority %in% c("jr", "na")) %>%
  arrange(desc(avg_salary)) %>%
  slice_head(n = 10)


# Dodged bar chart of average salary per industry, one bar per seniority
# label. Only rows 3 to 18 of industry_salary are drawn.
# NOTE(review): the 3:18 window looks like a hand-picked subset to keep the
# chart readable - confirm which industries are meant to be shown.
ggplot(
  slice(industry_salary, 3:18),
  aes(x = Industry, y = avg_salary, fill = seniority)
) +
  geom_col(position = "dodge") +
  ggtitle("Salary by Industry and Seniority") +
  xlab("Industry") +
  ylab("Average Salary") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# One posting has " Los Angeles" in job_state instead of the state code.
# Map it (and a bare "CA" without the leading space) onto " CA", the form
# every other value in the column uses.
data_1 <- data_1 %>%
  mutate(
    job_state = replace(
      job_state,
      job_state %in% c(" Los Angeles", "CA"),
      " CA"
    )
  )

# Sanity check: list the distinct job_state values after the clean-up.
unique(data_1[["job_state"]])
 [1] " NM" " MD" " FL" " WA" " NY" " TX" " CA" " VA" " MA"
[10] " NJ" " CO" " IL" " KY" " OR" " CT" " MI" " DC" " OH"
[19] " AL" " MO" " PA" " GA" " IN" " LA" " WI" " NC" " AZ"
[28] " NE" " MN" " UT" " TN" " DE" " ID" " RI" " IA" " SC"
[37] " KS"
# Postings ordered from the highest to the lowest average salary; this
# ordering is reused by the per-city plot below.
highest_salary <- data_1 %>%
  arrange(desc(avg_salary))

# Plot of the highest salary per state.
# A state has many postings, and identity bars stack every row that shares
# an x value, so plotting the raw rows would show the SUM of max_salary per
# state. Collapse to one row per state (its largest max_salary) first so the
# bar height matches the title.
highest_salary %>%
  group_by(job_state) %>%
  summarize(max_salary = max(max_salary, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = job_state, y = max_salary)) +
  geom_col(fill = "orange") +
  labs(title = "Highest Salaries by State", x = "State", y = "Salary") +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5))


# Plot of the highest salaries per city, restricted to the cities that
# appear in the 30 best-paid postings (highest_salary is already sorted by
# descending avg_salary).
# A city can occur several times in those 30 rows; identity bars would stack
# the repeats and show a sum, so keep a single row per city holding its
# largest max_salary. The x axis shows Location, hence the "City" label.
highest_salary %>%
  slice_head(n = 30) %>%
  group_by(Location) %>%
  summarize(max_salary = max(max_salary, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = Location, y = max_salary)) +
  geom_col(fill = "purple") +
  labs(title = "Highest Salaries by City", x = "City", y = "Salary") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Count of postings per ownership type in data_3.
table(data_3[["Type.of.ownership"]])

                            -1 
                             1 
          College / University 
                            13 
             Company - Private 
                           410 
              Company - Public 
                           193 
                    Government 
                            15 
                      Hospital 
                            15 
        Nonprofit Organization 
                            55 
            Other Organization 
                             3 
      School / School District 
                             2 
Subsidiary or Business Segment 
                            34 
                       Unknown 
                             1 
# Mean of avg_salary for four selected ownership types.
# Note: this reassigns `filtered_data`; from here on it holds only the
# Type.of.ownership / avg_salary summary.
filtered_data <- data_3 %>%
  filter(
    Type.of.ownership %in% c(
      "Company - Private",
      "Company - Public",
      "Government",
      "Nonprofit Organization"
    )
  ) %>%
  group_by(Type.of.ownership) %>%
  summarise(avg_salary = mean(avg_salary))

# One bar per ownership type, coloured by type.
ggplot(
  filtered_data,
  aes(x = Type.of.ownership, y = avg_salary, fill = Type.of.ownership)
) +
  geom_col() +
  ggtitle("Salary by Ownership Type") +
  xlab("Type of Ownership") +
  ylab("Average Salary") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Mean of avg_salary per state, keeping the 20 states with the highest mean
# and drawing them as bars ordered from lowest to highest.
data_1 %>%
  group_by(job_state) %>%
  summarise(avg_salary = mean(avg_salary)) %>%
  arrange(desc(avg_salary)) %>%
  slice_head(n = 20) %>%
  ggplot(
    mapping = aes(
      x = reorder(job_state, avg_salary),
      y = avg_salary,
      fill = job_state
    )
  ) +
  geom_col() +
  ggtitle("Average Salary by State") +
  xlab("State") +
  ylab("Average Salary") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

NA
NA
# Drop the states that have a single entry or only repeated entries; these
# four are excluded from the per-state violin plots.
state_data <- data_1 %>%
  filter(
    !job_state %in% c(
      " KS",
      " DE",
      " SC",
      " RI"
    )
  )
#' Violin plot of average salary for a single state
#'
#' Filters `data` to the rows whose `job_state` equals `state_name` and draws
#' the distribution of `avg_salary` as a violin.
#'
#' @param state_name A single state label, matched exactly against
#'   `job_state` (the values in `data_1` carry a leading space, e.g. " CA").
#' @param data Data frame with `job_state` and `avg_salary` columns. Defaults
#'   to the global `data_1`, so existing one-argument calls behave as before.
#' @return A ggplot object.
state_salary <- function(state_name, data = data_1) {
  # `==` would silently recycle a longer vector, so insist on one label.
  stopifnot(length(state_name) == 1L)

  state_rows <- data %>%
    filter(job_state == state_name)

  ggplot(state_rows, aes(x = job_state, y = avg_salary, fill = job_state)) +
    geom_violin() +
    labs(title = paste("Salary Range in", state_name), x = "State", y = "Salary") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
# Build one violin plot per remaining state (in order of first appearance
# in state_data) and print the resulting list of plots.
state_plots <- state_data %>%
  pull(job_state) %>%
  unique() %>%
  lapply(state_salary)
state_plots
[[1]]

[[2]]

[[3]]

[[4]]

[[5]]

[[6]]

[[7]]

[[8]]

[[9]]

[[10]]

[[11]]

[[12]]

[[13]]

[[14]]

[[15]]

[[16]]

[[17]]

[[18]]

[[19]]

[[20]]

[[21]]

[[22]]

[[23]]

[[24]]

[[25]]

[[26]]

[[27]]

[[28]]

[[29]]

[[30]]

[[31]]

[[32]]

[[33]]

# Inspect the rows of the four excluded states to see why their violin
# plots did not work.
wrong_entries <- data_1 %>%
  filter(job_state %in% c(" KS", " DE", " SC", " RI"))
print(wrong_entries)
NA
# check all the unique values for Revenue 
# table(data_3$Revenue)

# Revenue categories with the most entries, listed from the smallest to the
# largest revenue bracket.
revenue_categories <- c(
  "$50 to $100 million (USD)",
  "$100 to $500 million (USD)",
  "$500 million to $1 billion (USD)",
  "$1 to $2 billion (USD)",
  "$10+ billion (USD)"
)

# Keep only those categories and box-plot avg_salary per category.
# Revenue is a character column, so ggplot would order the x axis
# alphabetically ("$1 to $2 billion" before "$10+ billion" before
# "$100 to $500 million"). Converting it to a factor whose levels follow
# revenue_categories puts the axis and the legend in order of magnitude.
data_3 %>%
  filter(Revenue %in% revenue_categories) %>%
  mutate(Revenue = factor(Revenue, levels = revenue_categories)) %>%
  ggplot(aes(x = Revenue, y = avg_salary, fill = Revenue)) +
  geom_boxplot() +
  labs(
    title = "Salary Distribution by Company Revenue",
    x = "Company Revenue",
    y = "Average Salary"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(size = 12, face = "bold", vjust = 2),
    plot.margin = margin(5, 0, 0, 10),
    legend.text = element_text(size = 9)
  )

NA
NA
NA
# distinct work_year values present in data_4
unique(data_4[["work_year"]])
[1] 2024 2022 2023 2020 2021
# number of rows in data_1 for each Size value
table(data_1[["Size"]])

                     -1       1 to 50 employees 
                      1                      31 
       10000+ employees  1001 to 5000 employees 
                    130                     150 
   201 to 500 employees 5001 to 10000 employees 
                    117                      76 
  501 to 1000 employees     51 to 200 employees 
                    134                      94 
                Unknown 
                      9 
# number of rows in data_4 for each work_year
table(data_4[["work_year"]])

2020 2021 2022 2023 2024 
  75  218 1655 8519 6027 
# number of rows in data_3 for each Revenue value
table(data_3[["Revenue"]])

                              -1 
                               1 
          $1 to $2 billion (USD) 
                              60 
          $1 to $5 million (USD) 
                               8 
        $10 to $25 million (USD) 
                              32 
              $10+ billion (USD) 
                             124 
      $100 to $500 million (USD) 
                              91 
          $2 to $5 billion (USD) 
                              39 
        $25 to $50 million (USD) 
                              40 
         $5 to $10 billion (USD) 
                              19 
         $5 to $10 million (USD) 
                              18 
       $50 to $100 million (USD) 
                              46 
$500 million to $1 billion (USD) 
                              57 
      Less than $1 million (USD) 
                               4 
        Unknown / Non-Applicable 
                             203 
# list the distinct job_simp values in data_3
unique(data_3[["job_simp"]])
[1] "data scientist" "na"            
[3] "analyst"        "data engineer" 
[5] "director"       "manager"       
[7] "mle"           
# count how many rows fall under each job_simp value
table(data_3[["job_simp"]])

       analyst  data engineer data scientist 
           102            119            279 
      director        manager            mle 
            14             22             22 
            na 
           184 
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyfQojIGxpYnJhcnkoc2hpbnkpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeSh0aWR5cikKbGlicmFyeShyZXNoYXBlMikgIyByZXNoYXBpbmcgdGhlIGRhdGEgCmxpYnJhcnkoY29ycnBsb3QpICMgY29ycmVsYXRpb24gbWF0cml4IApsaWJyYXJ5KGNhcikgIyBhcHBsaWVkIHJlZ3Jlc3Npb24gCgpkYXRhXzEgPC0gcmVhZC5jc3YoZmlsZSA9ICIuL3NhbGFyeV9kYXRhX2NsZWFuZWQuY3N2IikgIyAKZGF0YV8yIDwtIHJlYWQuY3N2KGZpbGUgPSAiLi9nbGFzc2Rvb3Jfam9icy5jc3YiKSAjIApkYXRhXzMgPC0gcmVhZC5jc3YoZmlsZSA9ICIuL2VkYV9kYXRhLmNzdiIpICMgIGRhdGEgc2NpZW5jZSBqb2IgcG9zdGluZ3MgZnJvbSBHbGFzc2Rvb3IuY29tIGZvciAyMDE3LTIwMTggMzMgdmFyaWFibGVzIApkYXRhXzQgPC0gcmVhZC5jc3YoZmlsZSA9ICIuL3NhbGFyaWVzXzIuY3N2IikgICMgZGF0YSBzY2llbnRpc3Qgc2FsYXJpZXMgZm9yIDIwMjQKCmBgYAoKCmBgYHtyfQpoZWFkKGRhdGFfMSwgMTApCmBgYAoKCmBgYHtyfQpoZWFkKGRhdGFfMiwgNSkKYGBgCgoKYGBge3J9CmhlYWQoZGF0YV8zLCA1KQpgYGAKCgpgYGB7cn0KaGVhZChkYXRhXzQsIDUpCmBgYApgYGB7cn0KIyBjaGVja2luZyBmb3IgbWlzc2luZyB2YWx1ZXMgaW4gZWFjaCBkYXRhZnJhbWUgCnN1bShpcy5uYShkYXRhXzEpKQpzdW0oaXMubmEoZGF0YV8yKSkKc3VtKGlzLm5hKGRhdGFfMykpCnN1bShpcy5uYShkYXRhXzQpKQpgYGAKCgpgYGB7cn0KIyBjaGVjayBpZiB0aGVyZSBhcmUgZW1wdHkgc3RyaW5ncyBhbmQgcmVwbGFjZSB0aGVtIHdpdGggTkEKIyBkcGx5cjo6bXV0YXRlX2FsbChkYXRhXzEsIGxpc3Qofm5hX2lmKC4sIiIpKSkKZGF0YV8xICU+JSBkcGx5cjo6bXV0YXRlX2lmKGlzLmNoYXJhY3RlciwgbGlzdCh+bmFfaWYoLiwiIikpKSAKZGF0YV8yICU+JSBkcGx5cjo6bXV0YXRlX2lmKGlzLmNoYXJhY3RlciwgbGlzdCh+bmFfaWYoLiwiIikpKSAKZGF0YV8zICU+JSBkcGx5cjo6bXV0YXRlX2lmKGlzLmNoYXJhY3RlciwgbGlzdCh+bmFfaWYoLiwiIikpKSAKZGF0YV80ICU+JSBkcGx5cjo6bXV0YXRlX2lmKGlzLmNoYXJhY3RlciwgbGlzdCh+bmFfaWYoLiwiIikpKSAKYGBgCmBgYHtyfQpzdW0oaXMubmEoZGF0YV8xKSkKc3VtKGlzLm5hKGRhdGFfMikpCnN1bShpcy5uYShkYXRhXzMpKQpzdW0oaXMubmEoZGF0YV80KSkKYGBgCgoKYGBge3J9CnN1bW1hcnkuZGF0YS5mcmFtZShkYXRhXzEpCgpgYGAKYGBge3J9CnN1bW1hcnkuZGF0YS5mcmFtZShkYXRhXzIpCmBgYApgYGB7cn0Kc3VtbWFyeS5kYXRhLmZyYW1lKGRhdGFfMykKYGBgCmBgYHtyfQpzdW1tYXJ5LmRhdGEuZnJhbWUoZGF0YV80KQpgYGAKYGBge3J9CiMgZHJvcHBpbmcgY29sdW1uIFggZnJvbSBkYXRhXzMKZGF0YV8zIDwtIGRhdGFfMyAlPiUgc2VsZWN0KC1YKQpzdW1t
YXJ5LmRhdGEuZnJhbWUoZGF0YV8zKQpgYGAKYGBge3J9CiMgaG93IG1hbnkgZGlmZmVyZW50IGNvbXBhbmllcyBhcmUgaW4gZGF0YV8xCmxlbmd0aCh1bmlxdWUoZGF0YV8xJGNvbXBhbnlfdHh0KSkKIyBob3cgbWFueSBkaWZmZXJlbnQgaW5kdXN0aWVzIGluIGRhdGFfMQpsZW5ndGgodW5pcXVlKGRhdGFfMSRJbmR1c3RyeSkpCiMgaG93IG1hbnkgZGlmZmVyZW50IHNlY3RvcnMgaW4gZGF0YV8xCmxlbmd0aCh1bmlxdWUoZGF0YV8xJFNlY3RvcikpCiMgaG93IG1hbnkgZGlmZmVyZW50IGpvYiB0aXRlbHMgaW4gZGF0YV8xCmxlbmd0aCh1bmlxdWUoZGF0YV8xJEpvYi5UaXRsZSkpCiMgaG93IG1hbnkgZGlmZmVyZW50IHN0YXRlcyBpbiBkYXRhXzEKbGVuZ3RoKHVuaXF1ZShkYXRhXzEkam9iX3N0YXRlKSkKYGBgCmBgYHtyfQojIGhlYXQgbWFwIHRvIHNlZSB0aGUgY29ycmVsYXRpb24gYmV0d2VlbiB2YXJpYWJsZXMgCiMgY3JlYXRlIGEgc3Vic2V0IGZvciBkYXRhXzEgd2l0aCBudW1lcmljIHZhcmlhYmxlcyBvbmx5CgpkYXRhX251bWVyaWMgPC0gc2VsZWN0X2lmKGRhdGFfMSwgaXMubnVtZXJpYykgCgpkYXRhX251bWVyaWMgPC0gZGF0YV9udW1lcmljWywgIW5hbWVzKGRhdGFfbnVtZXJpYykgJWluJSBjKCJGb3VuZGVkIiwiaG91cmx5IiwgImVtcGxveWVyX3Byb3ZpZGVkIiwgInNhbWVfc3RhdGUiLCAiYWdlIildCgojIGRhdGFfbWVsdGVkIDwtIG1lbHQoZGF0YV9udW1lcmljKQoKIyBjb3JyZWxhdGlvbiBtYXRyaXggCmNvcl9tYXRyaXggPC0gY29yKGRhdGFfbnVtZXJpYykKY29yX21lbHRlZCA8LSBtZWx0KGNvcl9tYXRyaXgsIHZhcm5hbWVzID0gYygiVmFyMSIsICJWYXIyIiksIHZhbHVlLm5hbWUgPSAiQ29ycmVsYXRpb24iKQoKZ2dwbG90KGRhdGEgPSBjb3JfbWVsdGVkLCBhZXMoeCA9IFZhcjEsIHkgPSBWYXIyLCBmaWxsID0gQ29ycmVsYXRpb24pKSArCiAgZ2VvbV90aWxlKCkgKwogIGxhYnModGl0bGUgPSAiQ29ycmVsYXRpb24gSGVhdG1hcCIsCiAgICAgICB4ID0gIlZhcmlhYmxlIiwKICAgICAgIHkgPSAiVmFyaWFibGUiKSArCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA0NSwgdmp1c3QgPSAxLCBzaXplID0gMTIsIGhqdXN0ID0gMSkpICsKICBzY2FsZV9maWxsX2dyYWRpZW50Mihsb3cgPSAiYmx1ZSIsIGhpZ2ggPSAicmVkIiwgbWlkID0gIndoaXRlIiwgbWlkcG9pbnQgPSAwLCBsaW1pdCA9IGMoLTEsIDEpLCBzcGFjZSA9ICJMYWIiLCAKICAgICAgICAgICAgICAgICAgICAgICBuYW1lID0gIkNvcnJlbGF0aW9uIikgKwogICBnZW9tX3RleHQoYWVzKGxhYmVsID0gcm91bmQoQ29ycmVsYXRpb24sIDIpKSwgY29sb3IgPSAiYmxhY2siLCBzaXplID0gMikKYGBgCmBgYHtyfQpweXRob25fam9icyA8LSBkYXRhXzFbZGF0YV8xJHB5dGhvbl95biA9PTEsIF0KcmF0ZV9weXRob24gPSAoKG5yb3cocHl0aG9uX2pvYnMpKS9ucm93KGRhdGFfMSkpKjEwMAoKcl9qb2JzIDwtIGRhdGFfMVtk
YXRhXzEkUl95biA9PTEsIF0KcmF0ZV9yID0gKChucm93KHJfam9icykpL25yb3coZGF0YV8xKSkqMTAwCgpzcGFya19qb2JzIDwtIGRhdGFfMVtkYXRhXzEkc3BhcmsgPT0xLCBdCnJhdGVfc3BhcmsgPSAoKG5yb3coc3Bhcmtfam9icykpL25yb3coZGF0YV8xKSkqMTAwCgphd3Nfam9icyA8LSBkYXRhXzFbZGF0YV8xJGF3cyA9PTEsIF0KcmF0ZV9hd3MgPSAoKG5yb3coYXdzX2pvYnMpKS9ucm93KGRhdGFfMSkpKjEwMAoKZXhjZWxfam9icyA8LSBkYXRhXzFbZGF0YV8xJGV4Y2VsID09IDEsIF0KcmF0ZV9leGNlbCA9ICgobnJvdyhleGNlbF9qb2JzKSkvbnJvdyhkYXRhXzEpKSoxMDAKCnNraWxsc19kYXRhIDwtIGRhdGEuZnJhbWUoCiAgc2tpbGwgPSBjKCJQeXRob24iLCAiUiIsICJTcGFyayIsICJBV1MiLCAiRXhjZWwiKSwKICByYXRlID0gYyhyYXRlX3B5dGhvbiwgcmF0ZV9yLCByYXRlX3NwYXJrLCByYXRlX2F3cywgcmF0ZV9leGNlbCkKKQpnZ3Bsb3Qoc2tpbGxzX2RhdGEsIGFlcyh4ID0gc2tpbGwsIHkgPSByYXRlLCBmaWxsID0gc2tpbGwpKSArCiAgZ2VvbV9iYXIoc3RhdCA9ICJpZGVudGl0eSIpKwogIHRoZW1lX2xpZ2h0KCkrCiAgbGFicyh0aXRsZSA9ICJTa2lsbHMgUmVxdWlyZWQiLCB4ID0gIlNraWxsIiwgeSA9ICJSYXRlICglKSIpICsKICBnZW9tX3RleHQoYWVzKGxhYmVsID0gcGFzdGUwKHJvdW5kKHJhdGUsIDEpLCAiJSIpKSwgCiAgICAgICAgICAgICBwb3NpdGlvbiA9IHBvc2l0aW9uX3N0YWNrKHZqdXN0ID0gMC41KSxzaXplPTMpCgpgYGAKYGBge3J9CiMgbXVsdGlwbGUgc2tpbGxzIHZzIHNhbGFyeSAKIyAyIHNraWxscyB2cyAzIHNraWxscyB2cyA0IHNraWxscwojIGhvdyBtYW55IHNraWxscyBpbiBlYWNoIHJvdyBhcmUgdHJ1ZSwgY3JlYXRlIG5ldyBjb2x1bW4gCmZpbHRlcl9za2lsbHMgPC0gZnVuY3Rpb24oZGF0YSwgc2tpbGxfY291bnQpIHsKICBkYXRhICU+JQogICAgcm93d2lzZSgpICU+JQogICAgbXV0YXRlKHNraWxsX2NvdW50ID0gc3VtKGNfYWNyb3NzKGMocHl0aG9uX3luLCBSX3luLCBzcGFyaywgYXdzLCBleGNlbCkpID09IDEpKSAlPiUKICAgIHVuZ3JvdXAoKSAKICAgIAp9CgoKICBkYXRhX3NraWxscyA8LSBmaWx0ZXJfc2tpbGxzKGRhdGFfMSkgJT4lCiAgICBmaWx0ZXIoc2tpbGxfY291bnQgPj0gMCAmIHNraWxsX2NvdW50IDw9IDUpCiAgZ2dwbG90KGRhdGFfc2tpbGxzLCBhZXMoeCA9IGZhY3Rvcihza2lsbF9jb3VudCksIHkgPSBhdmdfc2FsYXJ5KSkgKwogICAgZ2VvbV9ib3hwbG90KCkgKwogICAgdGhlbWVfbWluaW1hbCgpICsKICAgIGxhYnModGl0bGUgPSBwYXN0ZShpLCAiTnVtYmVyIG9mIHNraWxscyB2cyBTYWxhcnkiKSwgCiAgICAgICAgIHggPSAiTnVtYmVyIG9mIFNraWxscyIsIAogICAgICAgICB5ID0gIkF2ZXJhZ2UgU2FsYXJ5KHRob3VzYW5kcykiKSsKICB0aGVtZShwbG90LnRpdGxlID0gZWxlbWVudF90ZXh0KGhqdXN0ID0gMC41KSkKICAKCmBgYApgYGB7cn0KCmRh
dGFfcHJvY2Vzc2VkIDwtIGRhdGFfMyAlPiUKICBzZWxlY3QoUmF0aW5nLCBhdmdfc2FsYXJ5LCBqb2Jfc2ltcCwgam9iX3N0YXRlLCBweXRob25feW4sIFJfeW4sIHNwYXJrLCBhd3MsIGV4Y2VsLCBJbmR1c3RyeSwgUmV2ZW51ZSkKCiMgY29udmVydCBjYXRlZ29yaWNhbCB2YXJpYWJsZXMgdG8gZmFjdG9ycwpkYXRhX3Byb2Nlc3NlZCRqb2Jfc2ltcCA8LSBhcy5mYWN0b3IoZGF0YV9wcm9jZXNzZWQkam9iX3NpbXApCmRhdGFfcHJvY2Vzc2VkJGpvYl9zdGF0ZSA8LSBhcy5mYWN0b3IoZGF0YV9wcm9jZXNzZWQkam9iX3N0YXRlKQpkYXRhX3Byb2Nlc3NlZCRJbmR1c3RyeSA8LSBhcy5mYWN0b3IoZGF0YV9wcm9jZXNzZWQkSW5kdXN0cnkpCmRhdGFfcHJvY2Vzc2VkJFJldmVudWUgPC0gYXMuZmFjdG9yKGRhdGFfcHJvY2Vzc2VkJFJldmVudWUpCmRhdGFfcHJvY2Vzc2VkJHB5dGhvbl95biA8LSBhcy5mYWN0b3IoZGF0YV9wcm9jZXNzZWQkcHl0aG9uX3luKQpkYXRhX3Byb2Nlc3NlZCRSX3luIDwtIGFzLmZhY3RvcihkYXRhX3Byb2Nlc3NlZCRSX3luKQpkYXRhX3Byb2Nlc3NlZCRzcGFyayA8LSBhcy5mYWN0b3IoZGF0YV9wcm9jZXNzZWQkc3BhcmspCmRhdGFfcHJvY2Vzc2VkJGF3cyA8LSBhcy5mYWN0b3IoZGF0YV9wcm9jZXNzZWQkYXdzKQpkYXRhX3Byb2Nlc3NlZCRleGNlbCA8LSBhcy5mYWN0b3IoZGF0YV9wcm9jZXNzZWQkZXhjZWwpCgoKbW9kZWwgPC0gbG0oYXZnX3NhbGFyeSB+IFJhdGluZyArIGpvYl9zaW1wICsgam9iX3N0YXRlICsgcHl0aG9uX3luICsgUl95biArIHNwYXJrICsgYXdzICsgZXhjZWwgKyBJbmR1c3RyeSArIFJldmVudWUsIGRhdGEgPSBkYXRhX3Byb2Nlc3NlZCkKCgpzdW1tYXJ5KG1vZGVsKQoKYGBgCmBgYHtyfQojIFJlc2lkdWFscyB2cyBGaXR0ZWQKZ2dwbG90KG1vZGVsLCBhZXMoLmZpdHRlZCwgLnJlc2lkKSkgKwogIGdlb21fcG9pbnQoKSArCiAgZ2VvbV9zbW9vdGgoc2UgPSBGQUxTRSkgKwogIGxhYnModGl0bGUgPSAiUmVzaWR1YWxzIHZzIEZpdHRlZCIsIHggPSAiRml0dGVkIHZhbHVlcyIsIHkgPSAiUmVzaWR1YWxzIikgKwogIHRoZW1lX21pbmltYWwoKQoKIyBOb3JtYWwgUS1RCmdncGxvdChtb2RlbCwgYWVzKHNhbXBsZSA9IC5zdGRyZXNpZCkpICsKICBzdGF0X3FxKCkgKwogIHN0YXRfcXFfbGluZSgpICsKICBsYWJzKHRpdGxlID0gIk5vcm1hbCBRLVEiLCB4ID0gIlRoZW9yZXRpY2FsIFF1YW50aWxlcyIsIHkgPSAiU3RhbmRhcmRpemVkIFJlc2lkdWFscyIpICsKICB0aGVtZV9taW5pbWFsKCkKCiMgU2NhbGUtTG9jYXRpb24gKG9yIFNwcmVhZC1Mb2NhdGlvbikKZ2dwbG90KG1vZGVsLCBhZXMoLmZpdHRlZCwgc3FydChhYnMoLnN0ZHJlc2lkKSkpKSArCiAgZ2VvbV9wb2ludCgpICsKICBnZW9tX3Ntb290aChzZSA9IEZBTFNFKSArCiAgbGFicyh0aXRsZSA9ICJTY2FsZS1Mb2NhdGlvbiIsIHggPSAiRml0dGVkIHZhbHVlcyIsIHkgPSAiU3F1YXJlIFJvb3Qgb2YgU3RhbmRhcmRpemVkIFJl
c2lkdWFscyIpICsKICB0aGVtZV9taW5pbWFsKCkKCiMgUmVzaWR1YWxzIHZzIExldmVyYWdlCmdncGxvdChtb2RlbCwgYWVzKC5oYXQsIC5zdGRyZXNpZCkpICsKICBnZW9tX3BvaW50KCkgKwogIGdlb21fc21vb3RoKHNlID0gRkFMU0UpICsKICBsYWJzKHRpdGxlID0gIlJlc2lkdWFscyB2cyBMZXZlcmFnZSIsIHggPSAiTGV2ZXJhZ2UiLCB5ID0gIlN0YW5kYXJkaXplZCBSZXNpZHVhbHMiKSArCiAgdGhlbWVfbWluaW1hbCgpCgpgYGAKYGBge3J9CiMgcmF0aW5nIHZzIGF2ZXJhZ2Ugc2FsYXJ5CmdncGxvdChkYXRhX3Byb2Nlc3NlZCwgYWVzKHggPSBSYXRpbmcsIHkgPSBhdmdfc2FsYXJ5KSkgKwogIGdlb21fcG9pbnQoKSArCiAgZ2VvbV9zbW9vdGgobWV0aG9kID0gImxtIiwgc2UgPSBGQUxTRSwgY29sb3IgPSAiYmx1ZSIpICsKICBsYWJzKHRpdGxlID0gIkVmZmVjdCBvZiBSYXRpbmcgb24gQXZnIFNhbGFyeSIsIHggPSAiUmF0aW5nIiwgeSA9ICJBdmVyYWdlIFNhbGFyeSIpICsKICB0aGVtZV9taW5pbWFsKCkKCiMgam9iIHRpdGxlIHZzIHNhbGFyeQpnZ3Bsb3QoZGF0YV9wcm9jZXNzZWQsIGFlcyh4ID0gam9iX3NpbXAsIHkgPSBhdmdfc2FsYXJ5KSkgKwogIGdlb21fYm94cGxvdCgpICsKICBsYWJzKHRpdGxlID0gIkVmZmVjdCBvZiBKb2IgVGl0bGUgb24gU2FsYXJ5IiwgeCA9ICJKb2IgVGl0bGUiLCB5ID0gIkF2ZXJhZ2UgU2FsYXJ5IikgKwogIHRoZW1lX21pbmltYWwoKSArCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA0NSwgaGp1c3QgPSAxKSkKCiMgam9iIHN0YXRlIHZzIHNhbGFyeSAKZ2dwbG90KGRhdGFfcHJvY2Vzc2VkLCBhZXMoeCA9IGpvYl9zdGF0ZSwgeSA9IGF2Z19zYWxhcnkpKSArCiAgZ2VvbV9ib3hwbG90KCkgKwogIGxhYnModGl0bGUgPSAiRWZmZWN0IG9mIEpvYiBTdGF0ZSBvbiBBdmcgU2FsYXJ5IiwgeCA9ICJKb2IgU3RhdGUiLCB5ID0gIkF2ZXJhZ2UgU2FsYXJ5IikgKwogIHRoZW1lX21pbmltYWwoKSArCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA2MCwgaGp1c3QgPSAxKSkKCgoKYGBgCmBgYHtyfQojIHJlZ3Jlc3Npb24gcGxvdCB3aXRoIG51bWJlciBvZiBza2lsbHMgcmVxdXJlZCAKZ2dwbG90KGZpbHRlcmVkX2RhdGEsIGFlcyh4ID0gc2tpbGxfY291bnQsIHkgPSBhdmdfc2FsYXJ5KSkgKwogIGdlb21fcG9pbnQoKSArCiAgZ2VvbV9zbW9vdGgobWV0aG9kID0gImxtIiwgc2UgPSBGQUxTRSwgY29sb3IgPSAiYmx1ZSIpICsKICBsYWJzKHRpdGxlID0gIk51bWJlciBvZiBTa2lsbHMgdnMgU2FsYXJ5IiwgeCA9ICJOdW1iZXIgb2YgU2tpbGxzIiwgeSA9ICJBdmVyYWdlIFNhbGFyeSIpICsKICB0aGVtZV9taW5pbWFsKCkKYGBgCmBgYHtyfQp0YWJsZShkYXRhXzMkc2VuaW9yaXR5KQojIGdyb3VwIGJ5IGluZHVzdHJ5IGFuZCBzZW5pb2l0eSB0byBmaW5kIGF2ZXJhZ2Ugc2FsYXJ5IGZvciBlYWNoIHNlbmlvcml0eSBsZXZlbAppbmR1
c3RyeV9zYWxhcnkgPC0gZGF0YV8zICU+JQogIGdyb3VwX2J5KEluZHVzdHJ5LCBzZW5pb3JpdHkpICU+JQogIHN1bW1hcml6ZShhdmdfc2FsYXJ5ID0gbWVhbihhdmdfc2FsYXJ5LCBuYS5ybSA9IFRSVUUpKSAlPiUgdW5ncm91cCgpCgojIHRvcCAxMCBoaWdoZXN0IHNhbGFyaWVzIGZvciBzZW5pb3IgcG9zaXRpb25zCnRvcF9zZW5pb3IgPC0gaW5kdXN0cnlfc2FsYXJ5ICU+JQogIGZpbHRlcihzZW5pb3JpdHkgPT0gInNlbmlvciIpICU+JQogIGFycmFuZ2UoZGVzYyhhdmdfc2FsYXJ5KSkgJT4lIHNsaWNlX2hlYWQobj0xMCkKIyB0b3AgMTAgc2FsYXJpZXMgZm9yIGpyIG9yIG5vbiBzcGVjaWZpZWQgcG9zaXRpb25zCnRvcF9ub25fc2VuaW9yIDwtIGluZHVzdHJ5X3NhbGFyeSAlPiUKICBmaWx0ZXIoc2VuaW9yaXR5ID09ICJqciIgfCBzZW5pb3JpdHkgPT0gIm5hIiApICU+JQogIGFycmFuZ2UoZGVzYyhhdmdfc2FsYXJ5KSkgJT4lIHNsaWNlX2hlYWQobj0xMCkKCgppbmR1c3RyeV9zYWxhcnkgJT4lICBzbGljZSgzOjE4KSAlPiUKZ2dwbG90KGFlcyh4ID0gSW5kdXN0cnksIHkgPSBhdmdfc2FsYXJ5LCBmaWxsID0gc2VuaW9yaXR5KSkgKwogIGdlb21fYmFyKHN0YXQgPSAiaWRlbnRpdHkiLCBwb3NpdGlvbiA9ICJkb2RnZSIpICsKICBsYWJzKHRpdGxlID0gIlNhbGFyeSBieSBJbmR1c3RyeSBhbmQgU2VuaW9yaXR5IiwgeCA9ICJJbmR1c3RyeSIsIHkgPSAiQXZlcmFnZSBTYWxhcnkiKSArCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA0NSwgaGp1c3QgPSAxKSkKCmBgYApgYGB7cn0KIyB0aGVyZSBpcyBvbmUgZW50cnkgaW4gY29sdW1uIGpvYl9zdGF0ZSB0aGF0IGhhcyBMb3MgQW5nZWxlcyBpbnN0ZWFkIG9mIENBLHNvIHdlIG5lZWQgdG8gZml4IHRoYXQKZGF0YV8xIDwtIGRhdGFfMSAlPiUKICBtdXRhdGUoam9iX3N0YXRlID0gaWZlbHNlKGpvYl9zdGF0ZSA9PSAiIExvcyBBbmdlbGVzIiB8IGpvYl9zdGF0ZSA9PSAiQ0EiLCAiIENBIiwgam9iX3N0YXRlKSkgCgojIGNoZWNrCnVuaXF1ZShkYXRhXzEkam9iX3N0YXRlKQpgYGAKCgoKCmBgYHtyfQpoaWdoZXN0X3NhbGFyeSA8LSBkYXRhXzEgJT4lCiAgYXJyYW5nZShkZXNjKGF2Z19zYWxhcnkpKQojIHBsb3Qgb2YgaGlnaGVzdCBzYWxhcmllcyBwZXIgU3RhdGUKaGlnaGVzdF9zYWxhcnkgJT4lIAogIGdncGxvdChhZXMoeCA9IGpvYl9zdGF0ZSwgeSA9IG1heF9zYWxhcnkpKSArCiAgZ2VvbV9iYXIoc3RhdCA9ICJpZGVudGl0eSIsIGZpbGwgPSAib3JhbmdlIikgICsKICBsYWJzKHRpdGxlID0gIkhpZ2hlc3QgU2FsYXJpZXMgYnkgU3RhdGUiLCB4ID0gIlN0YXRlIiwgeSA9ICJTYWxhcnkiKSArCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA5MCwgaGp1c3QgPSAwLjUpKQoKIyBwbG90IG9mIGhpZ2hlc3Qgc2FsYXJpZXMgcGVyIENpdHkKaGlnaGVzdF9zYWxhcnkgJT4lIAogIHNsaWNlX2hlYWQobj0zMCkgJT4lCiAg
Z2dwbG90KCBhZXMoeCA9IExvY2F0aW9uLCB5ID0gbWF4X3NhbGFyeSkpICsKICBnZW9tX2JhcihzdGF0ID0gImlkZW50aXR5IiwgZmlsbCA9ICJwdXJwbGUiKSArCiAgbGFicyh0aXRsZSA9ICJIaWdoZXN0IFNhbGFyaWVzIGJ5IENpdHkiLCB4ID0gIlN0YXRlIiwgeSA9ICJTYWxhcnkiKSArCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA0NSwgaGp1c3QgPSAxKSkKCmBgYApgYGB7cn0KdGFibGUoZGF0YV8zJFR5cGUub2Yub3duZXJzaGlwKQpmaWx0ZXJlZF9kYXRhIDwtIGRhdGFfMyAlPiUKICBmaWx0ZXIoVHlwZS5vZi5vd25lcnNoaXAgJWluJSBjKCJDb21wYW55IC0gUHJpdmF0ZSIsICJDb21wYW55IC0gUHVibGljIiwiR292ZXJubWVudCIsICJOb25wcm9maXQgT3JnYW5pemF0aW9uIikpICU+JQogIGdyb3VwX2J5KFR5cGUub2Yub3duZXJzaGlwKSAlPiUKICBzdW1tYXJpemUoYXZnX3NhbGFyeSA9IG1lYW4oYXZnX3NhbGFyeSkpCgpmaWx0ZXJlZF9kYXRhICU+JQpnZ3Bsb3QoYWVzKHggPSBUeXBlLm9mLm93bmVyc2hpcCwgeSA9IGF2Z19zYWxhcnksIGZpbGwgPSBUeXBlLm9mLm93bmVyc2hpcCkpICsKICBnZW9tX2JhcihzdGF0ID0gImlkZW50aXR5IikgKwogIGxhYnModGl0bGUgPSAiU2FsYXJ5IGJ5IE93bmVyc2hpcCBUeXBlIiwgeCA9ICJUeXBlIG9mIE93bmVyc2hpcCIsIHkgPSAiQXZlcmFnZSBTYWxhcnkiKSArCiAgdGhlbWVfbWluaW1hbCgpICsKICB0aGVtZShheGlzLnRleHQueCA9IGVsZW1lbnRfdGV4dChhbmdsZSA9IDQ1LCBoanVzdCA9IDEpKQoKYGBgCmBgYHtyfQojIGF2ZXJhZ2UgUGF5IHBlciBTdGF0ZSAoZm9yIDIwIFN0YXRlcykKZGF0YV8xICU+JQogIGdyb3VwX2J5KGpvYl9zdGF0ZSkgJT4lCiAgc3VtbWFyaXplKGF2Z19zYWxhcnkgPSBtZWFuKGF2Z19zYWxhcnkpKSAlPiUKICBhcnJhbmdlKGRlc2MoYXZnX3NhbGFyeSkpICU+JQogIHNsaWNlX2hlYWQobj0yMCkgJT4lCiAgZ2dwbG90KGFlcyh4ID0gcmVvcmRlcihqb2Jfc3RhdGUsIGF2Z19zYWxhcnkpLCB5ID0gYXZnX3NhbGFyeSwgZmlsbCA9IGpvYl9zdGF0ZSkpICsKICBnZW9tX2JhcihzdGF0ID0gImlkZW50aXR5IikgKwogIGxhYnModGl0bGUgPSAiQXZlcmFnZSBTYWxhcnkgYnkgU3RhdGUiLCB4ID0gIlN0YXRlIiwgeSA9ICJBdmVyYWdlIFNhbGFyeSIpICsKICB0aGVtZV9taW5pbWFsKCkgKwogIHRoZW1lKGF4aXMudGV4dC54ID0gZWxlbWVudF90ZXh0KGFuZ2xlID0gOTAsIGhqdXN0ID0gMSkpCgpgYGAKYGBge3J9CiMgcmVtb3ZlIHN0YXRlcyB3aXRoIDEgZW50cnkgb2Ygd2l0aCByZXBlYXRpbmcgZW50cmllcwpzdGF0ZV9kYXRhIDwtIGRhdGFfMSAlPiUKICAgIGZpbHRlcighKGpvYl9zdGF0ZSAlaW4lIGMoIiBLUyIsICIgREUiLCAiIFNDIiwgIiBSSSIpKSkgCiMgZnVuY3Rpb24gdGhhdCBjcmVhdGVzIHZpb2xpbiBwbG90cyBmb3IgZWFjaCBzdGF0ZQpzdGF0ZV9zYWxhcnkgPC0gZnVuY3Rpb24oc3RhdGVfbmFt
ZSkgewogIHN0YXRlX2RhdGEgPC0gZGF0YV8xICU+JQogICAgZmlsdGVyKGpvYl9zdGF0ZSA9PSBzdGF0ZV9uYW1lKSAKCiAgIGdncGxvdChzdGF0ZV9kYXRhLCBhZXMoeCA9IGpvYl9zdGF0ZSwgeSA9IGF2Z19zYWxhcnksIGZpbGwgPSBqb2Jfc3RhdGUpKSArCiAgICBnZW9tX3Zpb2xpbigpICsKICAgIGxhYnModGl0bGUgPSBwYXN0ZSgiU2FsYXJ5IFJhbmdlIGluIiwgc3RhdGVfbmFtZSksIHggPSAiU3RhdGUiLCB5ID0gIlNhbGFyeSIpICsKICAgIHRoZW1lX21pbmltYWwoKSArCiAgICB0aGVtZShheGlzLnRleHQueCA9IGVsZW1lbnRfdGV4dChhbmdsZSA9IDQ1LCBoanVzdCA9IDEpKSAKCn0KIyBzZXBhcmF0ZSBwbG90cyBmb3IgZWFjaCBzdGF0ZQpzdGF0ZV9wbG90cyA8LSBsYXBwbHkodW5pcXVlKHN0YXRlX2RhdGEkam9iX3N0YXRlKSwgc3RhdGVfc2FsYXJ5KQpzdGF0ZV9wbG90cwpgYGAKYGBge3J9CiMgY2hlY2sgd2h5IGNlcnRhaW4gcGxvdHMgZGlkbid0IHdvcmsgCndyb25nX2VudHJpZXMgPC0gZGF0YV8xICU+JQogIGZpbHRlcihqb2Jfc3RhdGUgPT0gIiBLUyIgfCBqb2Jfc3RhdGUgPT0iIERFIiB8am9iX3N0YXRlID09ICIgU0MiIHxqb2Jfc3RhdGUgPT0gIiBSSSIpCnByaW50KHdyb25nX2VudHJpZXMpCgpgYGAKYGBge3J9CiMgY2hlY2sgYWxsIHRoZSB1bmlxdWUgdmFsdWVzIGZvciBSZXZlbnVlIAojIHRhYmxlKGRhdGFfMyRSZXZlbnVlKQoKIyByZXZlbnVlIGNhdGVnb3JpZXMgd2l0aCB0aGUgbW9zdCBlbnRyaWVzIApyZXZlbnVlX2NhdGVnb3JpZXMgPC0gYygKICAiJDUwIHRvICQxMDAgbWlsbGlvbiAoVVNEKSIsCiAgIiQxMDAgdG8gJDUwMCBtaWxsaW9uIChVU0QpIiwKICAiJDUwMCBtaWxsaW9uIHRvICQxIGJpbGxpb24gKFVTRCkiLAogICIkMSB0byAkMiBiaWxsaW9uIChVU0QpIiwKICAiJDEwKyBiaWxsaW9uIChVU0QpIgopCgojIGZpbHRlciBkYXRhIGFuZCBjcmVhdGUgcGxvdCAKZGF0YV8zICU+JSBmaWx0ZXIoUmV2ZW51ZSAlaW4lIHJldmVudWVfY2F0ZWdvcmllcykgJT4lCiAgZ2dwbG90KGFlcyh4ID0gUmV2ZW51ZSwgeSA9IGF2Z19zYWxhcnksIGZpbGwgPSBSZXZlbnVlKSkgKwogIGdlb21fYm94cGxvdCgpICsKICBsYWJzKAogICAgdGl0bGUgPSAiU2FsYXJ5IERpc3RyaWJ1dGlvbiBieSBDb21wYW55IFJldmVudWUiLAogICAgeCA9ICJDb21wYW55IFJldmVudWUiLAogICAgeSA9ICJBdmVyYWdlIFNhbGFyeSIKICApICsKICB0aGVtZV9taW5pbWFsKCkgKwogIHRoZW1lKGF4aXMudGV4dC54ID0gZWxlbWVudF90ZXh0KGFuZ2xlID0gNDUsIGhqdXN0ID0gMSksCiAgICAgICAgIHBsb3QudGl0bGUgPSBlbGVtZW50X3RleHQoc2l6ZSA9IDEyLCBmYWNlID0gImJvbGQiLCB2anVzdCA9IDIpLAogICAgICAgIHBsb3QubWFyZ2luID0gbWFyZ2luKDUsMCwwLDEwKSwKICAgICAgICBsZWdlbmQudGV4dCA9IGVsZW1lbnRfdGV4dChzaXplID0gOSkKICAgICAgKQpgYGAKYGBge3J9CgoKYGBgCgoKYGBge3J9CnVu
aXF1ZShkYXRhXzQkd29ya195ZWFyKQp0YWJsZShkYXRhXzEkU2l6ZSkKdGFibGUoZGF0YV80JHdvcmtfeWVhcikKYGBgCmBgYHtyfQoKYGBgCgpgYGB7cn0KCmBgYAoKCgpgYGB7cn0KdGFibGUoZGF0YV8zJFJldmVudWUpCgpgYGAKCmBgYHtyfSAKIyBmaW5kIGFsbCB0aGUgdW5pcXVlIGpvYiB0aXRsZXMgZnJvbSBkYXRhXzMKdW5pcXVlKGRhdGFfMyRqb2Jfc2ltcCkKIyBzZWUgaG93IG1hbnkgb2YgZWFjaCB1bmlxdWUgam9iIHRpdGxlIHRoZXJlIGFyZSAKdGFibGUoZGF0YV8zJGpvYl9zaW1wKQpgYGAKCgoKYGBge3J9CgpgYGAKCmBgYHtyfQoKYGBgCgpgYGB7cn0KCmBgYAoKYGBge3J9CgpgYGAKCmBgYHtyfQoKYGBgCgpgYGB7cn0KCmBgYAoK